Lab 10

Lab 10#

General notebook information

We work through three NYC Open Data sets: vehicle collisions and citywide payroll as warm-ups, and then the population count of different community districts over time.

# Standard analysis imports: pandas for tabular data, plotly express
# for quick interactive charts.
import pandas as pd
import plotly.express as px

# Motor Vehicle Collisions - Crashes (NYC Open Data Socrata endpoint;
# the resource endpoint returns the first 1000 rows by default).
crashes_url = 'https://data.cityofnewyork.us/resource/h9gi-nx95.csv'
crash_data = pd.read_csv(crashes_url)
crash_data.head()
crash_date crash_time borough zip_code latitude longitude location on_street_name off_street_name cross_street_name ... contributing_factor_vehicle_2 contributing_factor_vehicle_3 contributing_factor_vehicle_4 contributing_factor_vehicle_5 collision_id vehicle_type_code1 vehicle_type_code2 vehicle_type_code_3 vehicle_type_code_4 vehicle_type_code_5
0 2021-09-11T00:00:00.000 2:39 NaN NaN NaN NaN NaN WHITESTONE EXPRESSWAY 20 AVENUE NaN ... Unspecified NaN NaN NaN 4455765 Sedan Sedan NaN NaN NaN
1 2022-03-26T00:00:00.000 11:45 NaN NaN NaN NaN NaN QUEENSBORO BRIDGE UPPER NaN NaN ... NaN NaN NaN NaN 4513547 Sedan NaN NaN NaN NaN
2 2022-06-29T00:00:00.000 6:55 NaN NaN NaN NaN NaN THROGS NECK BRIDGE NaN NaN ... Unspecified NaN NaN NaN 4541903 Sedan Pick-up Truck NaN NaN NaN
3 2021-09-11T00:00:00.000 9:35 BROOKLYN 11208.0 40.667202 -73.866500 \n, \n(40.667202, -73.8665) NaN NaN 1211 LORING AVENUE ... NaN NaN NaN NaN 4456314 Sedan NaN NaN NaN NaN
4 2021-12-14T00:00:00.000 8:13 BROOKLYN 11233.0 40.683304 -73.917274 \n, \n(40.683304, -73.917274) SARATOGA AVENUE DECATUR STREET NaN ... NaN NaN NaN NaN 4486609 NaN NaN NaN NaN NaN

5 rows × 29 columns

# List every column in the collisions data so we can pick which to analyze.
print(crash_data.columns)
Index(['crash_date', 'crash_time', 'borough', 'zip_code', 'latitude',
       'longitude', 'location', 'on_street_name', 'off_street_name',
       'cross_street_name', 'number_of_persons_injured',
       'number_of_persons_killed', 'number_of_pedestrians_injured',
       'number_of_pedestrians_killed', 'number_of_cyclist_injured',
       'number_of_cyclist_killed', 'number_of_motorist_injured',
       'number_of_motorist_killed', 'contributing_factor_vehicle_1',
       'contributing_factor_vehicle_2', 'contributing_factor_vehicle_3',
       'contributing_factor_vehicle_4', 'contributing_factor_vehicle_5',
       'collision_id', 'vehicle_type_code1', 'vehicle_type_code2',
       'vehicle_type_code_3', 'vehicle_type_code_4', 'vehicle_type_code_5'],
      dtype='object')
# Keep only the primary contributing factor, dropping rows where it is
# missing. Selecting with a list of columns returns a DataFrame (a single
# bare column name would return a Series instead).
cleaned_data = crash_data.loc[:, ["contributing_factor_vehicle_1"]].dropna()
cleaned_data.head()
contributing_factor_vehicle_1
0 Aggressive Driving/Road Rage
1 Pavement Slippery
2 Following Too Closely
3 Unspecified
5 Unspecified
# Summary statistics for the numeric columns. Note the 0.0 values in the
# latitude min and longitude max — these look like placeholder coordinates
# rather than real locations (TODO: confirm and filter if mapping).
crash_data.describe()
zip_code latitude longitude number_of_persons_injured number_of_persons_killed number_of_pedestrians_injured number_of_pedestrians_killed number_of_cyclist_injured number_of_cyclist_killed number_of_motorist_injured number_of_motorist_killed collision_id
count 632.000000 924.000000 924.000000 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000 1000.0 1000.000000 1000.000000 1.000000e+03
mean 10907.625000 40.413002 -73.360506 0.454000 0.004000 0.074000 0.003000 0.038000 0.0 0.333000 0.001000 4.440555e+06
std 516.926995 3.533802 6.413563 0.786449 0.063151 0.265696 0.054717 0.191292 0.0 0.768571 0.031623 4.151893e+04
min 10001.000000 0.000000 -74.200980 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 0.000000 0.000000 4.136992e+06
25% 10459.000000 40.660920 -73.966905 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 0.000000 0.000000 4.407764e+06
50% 11209.000000 40.708324 -73.927270 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 0.000000 0.000000 4.408298e+06
75% 11236.000000 40.783232 -73.872460 1.000000 0.000000 0.000000 0.000000 0.000000 0.0 0.000000 0.000000 4.456829e+06
max 11692.000000 40.905174 0.000000 7.000000 1.000000 2.000000 1.000000 1.000000 0.0 7.000000 1.000000 4.545699e+06
# Histogram of the primary contributing factor for each crash, with the
# count printed on top of each bar.
factor_fig = px.histogram(
    cleaned_data,
    x="contributing_factor_vehicle_1",
    title="Frequency of Contributing Factors in Vehicle Crashes",
    labels={"contributing_factor_vehicle_1": "Contributing Factor"},
    text_auto=True,
)

# Order categories from most to least frequent and add gaps between bars.
factor_fig.update_layout(
    xaxis=dict(categoryorder="total descending"),
    bargap=0.2,
)
factor_fig.show()
# Citywide Payroll data. The URL embeds a URL-encoded SoQL query that
# selects the payroll columns and orders by agency name (ascending), then
# fiscal year (descending).
url = 'https://data.cityofnewyork.us/resource/k397-673e.csv?$query=SELECT%0A%20%20%60fiscal_year%60%2C%0A%20%20%60payroll_number%60%2C%0A%20%20%60agency_name%60%2C%0A%20%20%60last_name%60%2C%0A%20%20%60first_name%60%2C%0A%20%20%60mid_init%60%2C%0A%20%20%60agency_start_date%60%2C%0A%20%20%60work_location_borough%60%2C%0A%20%20%60title_description%60%2C%0A%20%20%60leave_status_as_of_june_30%60%2C%0A%20%20%60base_salary%60%2C%0A%20%20%60pay_basis%60%2C%0A%20%20%60regular_hours%60%2C%0A%20%20%60regular_gross_paid%60%2C%0A%20%20%60ot_hours%60%2C%0A%20%20%60total_ot_paid%60%2C%0A%20%20%60total_other_pay%60%0AORDER%20BY%20%60agency_name%60%20ASC%20NULL%20LAST%2C%20%60fiscal_year%60%20DESC%20NULL%20FIRST'

pay = pd.read_csv(url)
pay.head()
fiscal_year payroll_number agency_name last_name first_name mid_init agency_start_date work_location_borough title_description leave_status_as_of_june_30 base_salary pay_basis regular_hours regular_gross_paid ot_hours total_ot_paid total_other_pay
0 2024 67 ADMIN FOR CHILDREN'S SVCS WALTHOUR ALIZE B 2024-05-20T00:00:00.000 BROOKLYN YOUTH DEVELOPMENT SPECIALIST ACTIVE 51502.0 per Annum 140.0 3961.86 0.00 0.00 0.00
1 2024 67 ADMIN FOR CHILDREN'S SVCS PARSONS MARISSA L 2012-10-22T00:00:00.000 BRONX CHILD PROTECTIVE SPECIALIST ACTIVE 67899.0 per Annum 1820.0 65567.23 195.25 9185.74 6174.07
2 2024 67 ADMIN FOR CHILDREN'S SVCS ADELEKE GAILANN A 2019-07-22T00:00:00.000 RICHMOND CHILD PROTECTIVE SPECIALIST CEASED 60545.0 per Annum 0.0 -120.75 0.00 0.00 0.00
3 2024 67 ADMIN FOR CHILDREN'S SVCS MCDONALD ALIZE NaN 2024-05-20T00:00:00.000 BRONX YOUTH DEVELOPMENT SPECIALIST ACTIVE 51502.0 per Annum 140.0 3959.13 0.00 0.00 0.00
4 2024 67 ADMIN FOR CHILDREN'S SVCS FAISON NAOMI M 2023-12-18T00:00:00.000 MANHATTAN SPACE ANALYST ACTIVE 76262.0 per Annum 980.0 37216.71 0.00 0.00 0.00
pay.info()

# mid_init is the only column with missing values (720 non-null of 1000);
# every other column is fully populated.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   fiscal_year                 1000 non-null   int64  
 1   payroll_number              1000 non-null   int64  
 2   agency_name                 1000 non-null   object 
 3   last_name                   1000 non-null   object 
 4   first_name                  1000 non-null   object 
 5   mid_init                    720 non-null    object 
 6   agency_start_date           1000 non-null   object 
 7   work_location_borough       1000 non-null   object 
 8   title_description           1000 non-null   object 
 9   leave_status_as_of_june_30  1000 non-null   object 
 10  base_salary                 1000 non-null   float64
 11  pay_basis                   1000 non-null   object 
 12  regular_hours               1000 non-null   float64
 13  regular_gross_paid          1000 non-null   float64
 14  ot_hours                    1000 non-null   float64
 15  total_ot_paid               1000 non-null   float64
 16  total_other_pay             1000 non-null   float64
dtypes: float64(6), int64(2), object(9)
memory usage: 132.9+ KB
# Numeric summary: fiscal_year and payroll_number are constant (2024, 67),
# and several pay columns contain negative values (e.g. regular_hours has a
# min of -320), presumably payroll corrections — TODO confirm.
pay.describe()
fiscal_year payroll_number base_salary regular_hours regular_gross_paid ot_hours total_ot_paid total_other_pay
count 1000.0 1000.0 1000.000000 1000.000000 1000.000000 1000.00000 1000.000000 1000.000000
mean 2024.0 67.0 65178.088900 530.389060 20603.270420 35.01600 1857.951720 370.303600
std 0.0 0.0 26263.957005 724.132809 30046.075524 161.21279 9122.247658 1945.330653
min 2024.0 67.0 16.000000 -320.000000 -13454.610000 -68.00000 -2512.420000 -17568.630000
25% 2024.0 67.0 51502.000000 0.000000 297.985000 0.00000 0.000000 0.000000
50% 2024.0 67.0 61373.500000 140.000000 4065.780000 0.00000 0.000000 0.000000
75% 2024.0 67.0 76127.000000 980.000000 34504.695000 0.00000 0.000000 0.000000
max 2024.0 67.0 193743.000000 2080.000000 140675.970000 1555.50000 99681.540000 6177.250000
# Keep only the columns needed for the salary scatter plot. A list of
# column names (double brackets) selects a DataFrame; a single bare name
# would index out a Series instead.
pay_clean = pay.loc[:, ["base_salary", "total_other_pay", "fiscal_year"]]

pay_clean.head()
base_salary total_other_pay fiscal_year
0 51502.0 0.00 2024
1 67899.0 6174.07 2024
2 60545.0 0.00 2024
3 51502.0 0.00 2024
4 76262.0 0.00 2024
# Base salary vs. all other pay, colored by fiscal year (every row here is
# FY2024, so the color scale collapses to a single value).
salary_fig = px.scatter(
    pay_clean,
    x="base_salary",
    y="total_other_pay",
    color="fiscal_year",
)

salary_fig.show()

Step 0#

Read the data from the New York City Population By Community Districts data set into a DataFrame called pop_by_cd. To get the URL:

  1. Visit the page linked above.

  2. Click Export.

  3. Right-click CSV.

  4. Click Copy Link Address (or Location, depending on your browser).

# New York City Population By Community Districts (full CSV export).
pop_url = 'https://data.cityofnewyork.us/api/views/xi7c-iiu2/rows.csv?accessType=DOWNLOAD'
pop_by_cd = pd.read_csv(pop_url)

pop_by_cd.head()
Borough CD Number CD Name 1970 Population 1980 Population 1990 Population 2000 Population 2010 Population
0 Bronx 1 Melrose, Mott Haven, Port Morris 138557 78441 77214 82159 91497
1 Bronx 2 Hunts Point, Longwood 99493 34399 39443 46824 52246
2 Bronx 3 Morrisania, Crotona Park East 150636 53635 57162 68574 79762
3 Bronx 4 Highbridge, Concourse Village 144207 114312 119962 139563 146441
4 Bronx 5 University Hts., Fordham, Mt. Hope 121807 107995 118435 128313 128200

Step 1#

Prepare the data. Use the following code to reshape the DataFrame to have one row per community district per Census year.

# Reshape from wide (one "<year> Population" column per census) to long
# format: one row per community district per census year.
populations = pd.melt(
    pop_by_cd,
    id_vars=["Borough", "CD Number", "CD Name"],
    var_name="year",
    value_name="population",
)

# The melted "year" values are strings like "1970 Population"; strip the
# suffix and cast to int so the year sorts and plots numerically.
# Use bracket assignment rather than `populations.year = ...`: attribute
# assignment can silently create an instance attribute instead of a column
# (and warns on views), so brackets are the reliable idiom.
populations["year"] = populations["year"].str.replace(" Population", "").astype(int)

populations
Borough CD Number CD Name year population
0 Bronx 1 Melrose, Mott Haven, Port Morris 1970 138557
1 Bronx 2 Hunts Point, Longwood 1970 99493
2 Bronx 3 Morrisania, Crotona Park East 1970 150636
3 Bronx 4 Highbridge, Concourse Village 1970 144207
4 Bronx 5 University Hts., Fordham, Mt. Hope 1970 121807
... ... ... ... ... ...
290 Queens 13 Queens Village, Rosedale 2010 188593
291 Queens 14 The Rockaways, Broad Channel 2010 114978
292 Staten Island 1 Stapleton, Port Richmond 2010 175756
293 Staten Island 2 New Springville, South Beach 2010 132003
294 Staten Island 3 Tottenville, Woodrow, Great Kills 2010 160209

295 rows × 5 columns

Step 2#

Create a line chart of the population over time for each community district in Manhattan. There should be one line for each.

# Restrict to Manhattan's community districts.
in_manhattan = populations["Borough"] == "Manhattan"
manhattan = populations[in_manhattan]
manhattan.head()
Borough CD Number CD Name year population
30 Manhattan 1 Battery Park City, Tribeca 1970 7706
31 Manhattan 2 Greenwich Village, Soho 1970 84337
32 Manhattan 3 Lower East Side, Chinatown 1970 181845
33 Manhattan 4 Chelsea, Clinton 1970 83601
34 Manhattan 5 Midtown Business District 1970 31076
# One line per Manhattan community district, tracking population across
# census years.
manhattan_fig = px.line(
    manhattan,
    x="year",
    y="population",
    color="CD Name",
    title="Population Over Time for Each Community District in Manhattan",
    labels={
        "year": "Year",
        "population": "Population",
        "CD Name": "Community District",
    },
)

manhattan_fig.show()

Step 3#

Starting with the same dataset, create a line chart of the population over time for each Borough. There should be one line for each.

# Total population per borough per census year: sum the community-district
# counts within each (Borough, year) group, then restore flat columns.
city_pop = (
    populations.groupby(["Borough", "year"])["population"]
    .sum()
    .reset_index()
)

city_pop.head()
Borough year population
0 Bronx 1970 1478524
1 Bronx 1980 1170349
2 Bronx 1990 1197420
3 Bronx 2000 1327454
4 Bronx 2010 1380697
# One line per borough, tracking total census population over time.
borough_fig = px.line(
    city_pop,
    x="year",
    y="population",
    color="Borough",
    title="Population Over Time for Each Borough in NYC",
    labels={"year": "Year", "population": "Population"},
)

borough_fig.show()

Step 4#

Submit via Gradescope.